About the analysis

This is an analysis for the impact of introducing the lunch program in 1955 the growth trend of the Japanese Boys

https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3398189

The data

The data source is Ministry of Internal Affairs and Communications Statistics Bureau.

www.e-stat.go.jp/SG1/estat/List.do?bid=000001014499&

NOTE: each column represent an age group

data <- read_xlsx("FEH_00400002_221209195404.xlsx")
data[data == "***"] <- NA
datatable(data)

Cleaning the data

Changing the data to long format

data <- data %>% select(year, y17) %>% rename(height = y17) %>% mutate(height =  as.numeric(height)) %>% mutate(year = year - 1900)
datatable(data)

Analysis Steps

1. choose a maximum height (e.g. 200 cm)

max_height <- 175

2. divide all heights by the maximum

data$height_scaled <- data$height / max_height

3. calculate the logistic transform of the divided heights, log(y/(1-y))

data$logit_height <- log(data$height_scaled / (1 - data$height_scaled))

4. Model this data against time

model_all <- lm(logit_height ~ year, data = data)
tbl_regression1 <- tbl_regression(model_all)

5. back transform to get the predicted value at each year

data$predicted_all <- predict(model_all, newdata = data)
data$predicted_all <- exp(data$predicted_all)/(1+exp(data$predicted_all))*max_height
datatable(data)

5b. Creating a Plot

ggplot(data = data, aes(x = year, y = height, colour = "Oberved Heights")) +
  scale_color_manual(labels = c("Observed Heights", "Predicted Data"), values = c("blue", "red")) + geom_line() + geom_line(aes(y = predicted_all, colour = "Predicted Data"))  

6. Model the data only up to the beginning of the war

war_year <- 1940 - 1900
end_war_year <- 1948 - 1900
data_pre_war <- data %>% filter(year <= war_year)
fit_pre_war <- lm(logit_height ~ year, data = data_pre_war)
tbl_regression2 <- tbl_regression(fit_pre_war)

7. project this forward to now

data$predicted_pre_war <- predict(fit_pre_war, newdata = data)
data$predicted_pre_war <- exp(data$predicted_pre_war)/(1+exp(data$predicted_pre_war))*max_height
datatable(data)

7b. Plotting

ggplot(data = data, aes(x = year, y = height, colour = "Oberved Heights")) +
  scale_color_manual(labels = c("Observed Heights", "Predicted Data Full Model", "Pre War Model"), values = c("blue", "red", "black")) + geom_line() + geom_line(aes(y = predicted_all, colour = "Predicted Data"))  + geom_line(aes(y = predicted_pre_war, colour = "Pre War Projection")) 

8. Generate a predicted curve using only post-war data

data_post_war <- data %>% filter(year >= end_war_year)
fit_post_war <- lm(logit_height ~ I(year^2) + year, data = data_post_war)
tbl_regression3 <- tbl_regression(fit_post_war)

data$predicted_post_war <- predict(fit_post_war, newdata = data)
data$predicted_post_war <- exp(data$predicted_post_war)/(1+exp(data$predicted_post_war))*max_height
datatable(data)

8b Fitting a direct model

data_post_war <- data %>% filter(year >= end_war_year)
fit_post_war_direct <- lm(logit_height ~ I(year^2), data = data_post_war)
tbl_regression4 <- tbl_regression(fit_post_war_direct)

data$predicted_post_war_direct <- predict(fit_post_war_direct, newdata = data)
data$predicted_post_war_direct <- exp(data$predicted_post_war_direct)/(1+exp(data$predicted_post_war_direct))*max_height
datatable(data)
tbl_merge(
    tbls = list(tbl_regression1, tbl_regression2, tbl_regression3, tbl_regression4), 
    tab_spanner = c("**Full Data**", "**Pre War**", "Post War Full", "Post War Squared Only") 
  ) 
Characteristic Full Data Pre War Post War Full Post War Squared Only
Beta 95% CI1 p-value Beta 95% CI1 p-value Beta 95% CI1 p-value Beta 95% CI1 p-value
year 0.02 0.01, 0.02 <0.001 0.01 0.01, 0.01 <0.001 0.09 0.08, 0.09 <0.001
I(year^2) 0.00 0.00, 0.00 <0.001 0.00 0.00, 0.00 <0.001
1 CI = Confidence Interval
ggplot(data = data, aes(x = year, y = height, colour = "Oberved Heights")) +
  scale_color_manual(labels = c("Observed Heights", "Direct Fit", "Poly"), values = c("blue", "red", "black")) + 
  geom_line() +
  geom_line(aes(y = predicted_post_war_direct, colour = "Direct Fit")) + 
  geom_line(aes(y = predicted_post_war, colour = "Poly")) 

9. Plotting

data <- data %>% filter(year >= 40)
full_plot <-ggplot(data = data, aes(x = year, y = height, colour = "Oberved Heights")) +
  scale_color_manual(labels = c("Observed Heights", "Predicted Data Full Model", "Pre War Model", "Post War Model Poly", "Post War Direct"), values = c("blue", "red", "black", "green", "grey")) + 
  geom_line() + 
  geom_line(aes(y = predicted_all, colour = "Predicted Data"))  + 
  geom_line(aes(y = predicted_pre_war, colour = "Pre War Projection")) + 
  geom_line(aes(y = predicted_post_war, colour = "Post War Model Poly")) +
    geom_line(aes(y = predicted_post_war, colour = "Post War Direct")) 

ggplotly(full_plot)

Creating a heigh prediction for each age group from 1900-2015

Looping the whole data

data <- read_xlsx("FEH_00400002_221209195404.xlsx")
data[data == "***"] <- NA
datatable(data)
height_list <- list()

for(i in 3:14) {
  pre_col_name <- colnames(data)[i]
  rdata <- data %>% select(year, pre_col_name) %>% rename(height = pre_col_name) %>% 
    mutate(height = as.numeric(height)) %>% mutate(year = year - 1900)
  # Transformation
  max_height <- 175
  rdata$height <- as.numeric(rdata$height)
  rdata$height_scaled <- rdata$height / max_height
  rdata$logit_height <- log(rdata$height_scaled / (1 - rdata$height_scaled))
  # setting war parameters
  war_year <- 1940 - 1900
  end_war_year <- 1948 - 1900
  
  # Pre-war-Model
  rdata_pre_war <- na.omit(rdata %>% filter(year <= war_year))
  rfit_pre_war <- lm(logit_height ~ year, data = rdata_pre_war)
  rdata$predicted_pre_war <- predict(rfit_pre_war, newdata = rdata)
  rdata$predicted_pre_war <- exp(rdata$predicted_pre_war)/(1+exp(rdata$predicted_pre_war))*max_height
  
  # Post_war_model
  data_post_war <- rdata %>% filter(year >= end_war_year)
  fit_post_war <- lm(logit_height ~ I(year^2) + year, data = data_post_war)
  rdata$predicted_post_war <- predict(fit_post_war, newdata = rdata)
  rdata$predicted_post_war <- exp(rdata$predicted_post_war)/(1+exp(rdata$predicted_post_war))*max_height
  
  # assign to list
    rdata <- rdata %>% filter(year >= 40)
  height_list[[i-2]] <- rdata 
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(pre_col_name)
## 
##   # Now:
##   data %>% select(all_of(pre_col_name))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Add a new column to each data frame that represents the number of the data frame in the list
for (i in 1:length(height_list)) {
 height_list[[i]]$df_num <- i
}

df_combined <- do.call(rbind, height_list)
factors_labels <- vector()
for(i in unique(df_combined$df_num)) {
          factors_labels <- c(factors_labels, paste("Age Group", i + 5) )                    
                    }
# Exchanging the number by the age group
df_combined$df_num <- factor(df_combined$df_num, levels = unique(df_combined$df_num),
                             labels = factors_labels
                             )

# adding 1900 to the year
df_combined$year <- df_combined$year + 1900

Creating a plot list

plot_list <- list()

for (i in 1:length(height_list)) {
  df <- na.omit(height_list[[i]])
  p <- ggplot(data = height_list[[i]], aes(x = year, y = height, colour = "Oberved Heights")) +
  scale_color_manual(labels = c("Observed Heights", "Pre War", "Post War"), values = c("blue", "red", "black")) + 
  geom_line() +
  geom_line(aes(y = predicted_pre_war, colour = "Pre War")) + 
  geom_line(aes(y = predicted_post_war, colour = "Post War")) 

  plot_list[[i]] <- p
  i <- i + 1
}
lapply(seq_along(plot_list), function(i) {
  ggsave(plot_list[[i]], file = paste(i, ".png", sep = ""))
})
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## [[1]]
## [1] "1.png"
## 
## [[2]]
## [1] "2.png"
## 
## [[3]]
## [1] "3.png"
## 
## [[4]]
## [1] "4.png"
## 
## [[5]]
## [1] "5.png"
## 
## [[6]]
## [1] "6.png"
## 
## [[7]]
## [1] "7.png"
## 
## [[8]]
## [1] "8.png"
## 
## [[9]]
## [1] "9.png"
## 
## [[10]]
## [1] "10.png"
## 
## [[11]]
## [1] "11.png"
## 
## [[12]]
## [1] "12.png"

Creating one major plot

ggplot(df_combined, aes(x = year, y = height, color = df_num)) + 
  geom_line(aes(group = df_num)) +
  scale_x_continuous(limits = c(min(df_combined$year), max(df_combined$year))) +
  scale_y_continuous(limits = c(min(df_combined$height), max(df_combined$height))) +
  labs(x = "Year", y = "Height") + 
  scale_color_discrete(name = "Age Group") +
  theme(legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))

Getting the Meeting Point

height_list <- list()

for(i in 3:14) {
  pre_col_name <- colnames(data)[i]
  rdata <- data %>% select(year, pre_col_name) %>% rename(height = pre_col_name) %>% 
    mutate(height = as.numeric(height)) %>% mutate(year = year - 1900)
  # Transformation
  max_height <- 175
  rdata$height <- as.numeric(rdata$height)
  rdata$height_scaled <- rdata$height / max_height
  rdata$logit_height <- log(rdata$height_scaled / (1 - rdata$height_scaled))
  # setting war parameters
  war_year <- 1940 - 1900
  end_war_year <- 1948 - 1900
  
  # Pre-war-Model
  rdata_pre_war <- na.omit(rdata %>% filter(year <= war_year))
  rfit_pre_war <- lm(logit_height ~ year, data = rdata_pre_war)
  rdata$predicted_pre_war <- predict(rfit_pre_war, newdata = rdata)
  rdata$predicted_pre_war <- exp(rdata$predicted_pre_war)/(1+exp(rdata$predicted_pre_war))*max_height
  
  # Post_war_model
  data_post_war <- rdata %>% filter(year >= end_war_year)
  fit_post_war <- lm(logit_height ~ I(year^2) + year, data = data_post_war)
  rdata$predicted_post_war <- predict(fit_post_war, newdata = rdata)
  rdata$predicted_post_war <- exp(rdata$predicted_post_war)/(1+exp(rdata$predicted_post_war))*max_height
  
  # assign to list
  height_list[[i-2]] <- rdata 
}

meeting_point <- data.frame(age = 6:17, meeting_point = NA)
for(i in 1:length(height_list)){
  df <- height_list[[i]]
  min_index <- which.min(abs(df$predicted_pre_war - df$predicted_post_war))
  meeting_point[i,2] <- df[min_index, ]$year
}
library("ggthemes")
ggplot(data = meeting_point, aes(x = age, y = meeting_point)) +
  geom_point() +
  theme_economist() +
    labs(x = "Age", y = "Meeting Point (Year)")

Creating the growth Curve for Each Cohort of Children

data <- read_xlsx("FEH_00400002_221209195404.xlsx")
data <- arrange(data, year)
data[data == "***"] <- NA
new_data <- data[, -c(1,2)]
new_data <- as.matrix(new_data)
diag_list <- list()
for(i in 1:nrow(new_data)) {
  diag_list[[i]] <- diag(new_data[i:nrow(new_data),])
}

df_list <- list()
age_vec <- 6:17
for (i in 1:length(diag_list)) {
  df_list[[i]] <- data.frame(age=age_vec[1:length(diag_list[[i]])], height = diag_list[[i]])
}

Plotting

ylim <- range(unlist(lapply(df_list, function(x) x$height)))

saveGIF({
  for (i in 1:length(df_list)) {
    plot(df_list[[i]]$age, df_list[[i]]$height, type="l", xlab="Age", ylab="Height",
         ylim=c(floor(as.numeric(ylim[1])), ceiling(as.numeric(ylim[2]))), main=paste("Year:", 1900 + i),
         col=i, lwd=2, las=1, cex.main=1.5, cex.lab=1.5)
    abline(h=mean(df_list[[i]]$height), lty=2, col="gray")
  }
}, interval=0.5, movie.name="growth_curves.gif")
img <- image_read("growth_curves.gif")
image_animate(img)